import pandas as pd
import matplotlib.pyplot as plt
# Load the film/ratings table from the CSV that sits next to this notebook.
ratings = pd.read_csv(filepath_or_buffer="FilmData2.csv")
# Preview the first five rows to sanity-check the columns.
ratings.head(n=5)
| | Hulu | Netflix | Amazon | DisneyPlus | type | title | release_year | region | language | tconst | ... | primaryTitle | originalTitle | isAdult | startYear | endYear | runtimeMinutes | genres | tconst-1 | averageRating | numVotes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt4048280 | ... | RED WINDOWS | Red Windows | 0 | 2016 | \N | NaN | Thriller | tt4048280 | 2.5 | 21 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | \N | \N | tt4048280 | ... | RED WINDOWS | Red Windows | 0 | 2016 | \N | NaN | Thriller | tt4048280 | 2.5 | 21 |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | GB | \N | tt3335210 | ... | WHAT GOES UP | What Goes Up | 0 | 2014 | \N | 87.0 | Comedy,Romance,Sci-Fi | tt3335210 | 2.5 | 23 |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | \N | \N | tt3335210 | ... | WHAT GOES UP | What Goes Up | 0 | 2014 | \N | 87.0 | Comedy,Romance,Sci-Fi | tt3335210 | 2.5 | 23 |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt3335210 | ... | WHAT GOES UP | What Goes Up | 0 | 2014 | \N | 87.0 | Comedy,Romance,Sci-Fi | tt3335210 | 2.5 | 23 |
5 rows × 21 columns
# Keep only rows that are movies, have a genre value other than the literal
# '\N' string (presumably the dataset's missing-value placeholder), and are
# listed for the US region.
vals = ratings.loc[
    ratings['titleType'].eq('movie')
    & ratings['genres'].ne('\\N')
    & ratings['region'].eq('US')
]
# Then drop rows with 100 votes or fewer.
vals = vals.loc[vals['numVotes'].gt(100)]
vals.head()
| | Hulu | Netflix | Amazon | DisneyPlus | type | title | release_year | region | language | tconst | ... | primaryTitle | originalTitle | isAdult | startYear | endYear | runtimeMinutes | genres | tconst-1 | averageRating | numVotes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 47 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt4412218 | ... | RUNNING MAN | Benpao Ba! Xiongdi | 0 | 2015 | \N | 88.0 | Action,Comedy,Documentary | tt4412218 | 2.5 | 115 |
| 55 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt4253170 | ... | THE APOSTLES | Gui zhen | 0 | 2013 | \N | 91.0 | Fantasy,Horror,Sci-Fi | tt4253170 | 2.5 | 133 |
| 58 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt4679576 | ... | BLOODHOUND | Bloodhound | 0 | 2020 | \N | 72.0 | Crime,Drama | tt4679576 | 2.5 | 135 |
| 65 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt4591226 | ... | THE LAST HOUSE | The Last House | 0 | 2015 | \N | 91.0 | Horror | tt4591226 | 2.5 | 155 |
| 67 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | \N | tt2757592 | ... | A TALKING PONY!?! | A Talking Pony!?! | 0 | 2013 | \N | 88.0 | Comedy,Family,Fantasy | tt2757592 | 2.5 | 184 |
5 rows × 21 columns
# Collect every distinct genre label in first-seen order. Each row's `genres`
# value is a comma-separated string (e.g. "Comedy,Romance,Sci-Fi"), so split
# it and let a dict, which preserves insertion order, drop the repeats.
genres = list(
    dict.fromkeys(
        label
        for genre_field in vals["genres"]
        for label in genre_field.split(",")
    )
)
genres
['Action', 'Comedy', 'Documentary', 'Fantasy', 'Horror', 'Sci-Fi', 'Crime', 'Drama', 'Family', 'Thriller', 'Mystery', 'Adventure', 'Animation', 'Romance', 'Sport', 'Music', 'Biography', 'Western', 'War', 'Reality-TV', 'History', 'Musical', 'News', 'Talk-Show', 'Adult', 'Short']
# Pull the rating column out on its own; despite the name, `df` is a Series.
df = vals.loc[:, 'averageRating']
# Its shape gives the number of rows left after filtering.
df.shape
(57592,)
import plotly
import plotly.express as px
# Distribution of average ratings over the filtered rows. Columns are
# referenced by name so plotly takes both data and axis label from `vals`.
fig = px.histogram(
    data_frame=vals,
    x="averageRating",
    title="IMDB Scores of the Programs",
)
fig.show()

# The same distribution as a box plot, to show the median, quartiles and
# outliers.
fig = px.box(data_frame=vals, x="averageRating")
fig.show()
# Pairwise Pearson correlation between rating and runtime.
vals.loc[:, ["averageRating", "runtimeMinutes"]].corr(method="pearson")
| | averageRating | runtimeMinutes |
|---|---|---|
| averageRating | 1.000000 | 0.025417 |
| runtimeMinutes | 0.025417 | 1.000000 |
# Ten highest-rated rows, keeping only the columns the chart below needs.
top_10_ratings = (
    vals[["averageRating", "title", "genres", "startYear"]]
    .sort_values("averageRating", ascending=False)
    .head(10)
)
top_10_ratings

# NOTE(review): the previews earlier in this notebook appear to show NaN in
# the `title` column while `primaryTitle` is populated -- confirm `title` is
# the intended label column for the y axis.
fig = px.scatter(
    top_10_ratings,
    x="averageRating",
    y="title",
    color="genres",
    # hover_data takes column names; name the columns directly rather than
    # passing a DataFrame slice.
    hover_data=["genres", "startYear"],
    title="Top 10 High Rated Programs",
)
fig.show()
# Pairwise Pearson correlation between rating and release year.
vals.loc[:, ["averageRating", "startYear"]].corr(method="pearson")
| | averageRating | startYear |
|---|---|---|
| averageRating | 1.000000 | -0.016972 |
| startYear | -0.016972 | 1.000000 |
# One-column frame holding just the ratings. It is not used in the cells
# visible here; kept in case a later cell relies on it.
plotdata = pd.DataFrame(vals["averageRating"])

# Mean rating per start year (displayed only, not stored).
vals.groupby('startYear', as_index=False)['averageRating'].mean()

# Bar chart built from the un-aggregated rows, with columns referenced by name.
# NOTE(review): px.bar draws one mark per input row, so each year's bar is
# presumably a stack (sum) of individual ratings rather than an average --
# the aggregated chart built from the per-year mean is likely the one to keep.
fig = px.bar(data_frame=vals, x="startYear", y="averageRating")
fig.show()
# Average rating for each start year, one row per year.
df2 = vals.groupby('startYear', as_index=False)['averageRating'].mean()

# Plot the per-year averages; columns are referenced by name so the axis
# labels come straight from `df2`.
fig = px.bar(data_frame=df2, x="startYear", y="averageRating")
fig.show()
# Average rating for each distinct value of the `genres` column.
# NOTE(review): `genres` holds comma-separated combinations such as
# "Comedy,Romance,Sci-Fi", so this groups by the whole combination, not by
# individual genre -- confirm that is intended, or split the column first.
df3 = vals.groupby('genres', as_index=False)['averageRating'].mean()

# Plot the per-group averages, with columns referenced by name.
fig = px.bar(data_frame=df3, x="genres", y="averageRating")
fig.show()